Refer to the review slides for definitions.
# getModes() is defined outside this section; presumably it returns the most
# frequent value(s) of its argument -- the printed result below is 8.
getModes(mtcars$cyl)
## [1] 8
# Frequency table of iris species rendered with gt. table() names its
# category column "Var1" here, so it is relabelled "Species" for display.
table(iris$Species) %>%
  data.frame() %>%
  gt() %>%
  cols_label(Var1 = "Species")
| Species | Freq |
|---|---|
| setosa | 50 |
| versicolor | 50 |
| virginica | 50 |
# Bar chart of the number of observations per species.
# Species is categorical, so geom_bar() (which counts rows per category) is the
# appropriate geom; geom_histogram(stat = "count") only works by overriding the
# binning stat and warns about ignored binning parameters.
ggplot(iris, aes(x = Species)) +
  geom_bar(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Plant Species") +
  theme_bw()
# Manual frequency distribution of mpg using 2-unit-wide bins from 10 to 34.
bins <- seq(from = 10, to = 34, by = 2)
# cut() labels every value with its right-closed interval, e.g. (10,12].
mpg <- cut(mtcars$mpg, breaks = bins)
# One row per interval with its count in the Freq column.
freqDist <- data.frame(table(mpg))
gt(freqDist)
| mpg | Freq |
|---|---|
| (10,12] | 2 |
| (12,14] | 1 |
| (14,16] | 7 |
| (16,18] | 3 |
| (18,20] | 5 |
| (20,22] | 5 |
| (22,24] | 2 |
| (24,26] | 2 |
| (26,28] | 1 |
| (28,30] | 0 |
| (30,32] | 2 |
| (32,34] | 2 |
# Binning using ggplot
# boundary = 10 puts the bin edges on 10, 12, ..., 34 so this histogram uses
# the same 2-unit bins as the manual frequency table; by default ggplot2 would
# centre the bins on multiples of the binwidth (edges on odd values here).
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(
    binwidth = 2, boundary = 10,
    colour = "black", fill = "darkblue", alpha = 0.75
  ) +
  labs(
    title = "Distribution of Miles Per Gallon",
    caption = "1974 Motor Trend US Magazine"
  ) +
  theme_bw()
# Using manual binning method
# The counts are already computed in freqDist, so geom_col() draws each bar
# height straight from the Freq column instead of counting rows.
ggplot(freqDist, aes(x = mpg, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(
    title = "Distribution of Miles Per Gallon",
    caption = "Data is from 1974 Motor Trend US Magazine",
    x = "Miles Per Gallon",
    y = "Frequency"
  ) +
  theme_bw()
# Symmetric: counts rise to a single peak at rating 3 and mirror on both sides
rating <- rep(c(1, 2, 3, 4, 5), times = c(10, 20, 30, 20, 10))
freqDist <- data.frame(table(rating))
gt(freqDist)
| rating | Freq |
|---|---|
| 1 | 10 |
| 2 | 20 |
| 3 | 30 |
| 4 | 20 |
| 5 | 10 |
# Bar chart of the rating frequency table; heights come from the Freq column.
ggplot(freqDist, aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()
# Asymmetric: most ratings sit at the high end, with the peak at rating 4
rating <- rep(c(1, 2, 3, 4, 5), times = c(10, 20, 30, 60, 35))
freqDist <- data.frame(table(rating))
gt(freqDist)
| rating | Freq |
|---|---|
| 1 | 10 |
| 2 | 20 |
| 3 | 30 |
| 4 | 60 |
| 5 | 35 |
# Bar chart of the rating frequency table; heights come from the Freq column.
ggplot(freqDist, aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()
# Symmetric: two equal peaks at ratings 2 and 4, mirrored around rating 3
rating <- rep(c(1, 2, 3, 4, 5), times = c(20, 40, 10, 40, 20))
freqDist <- data.frame(table(rating))
gt(freqDist)
| rating | Freq |
|---|---|
| 1 | 20 |
| 2 | 40 |
| 3 | 10 |
| 4 | 40 |
| 5 | 20 |
# Bar chart of the rating frequency table; heights come from the Freq column.
ggplot(freqDist, aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()
# Asymmetric: peaks at ratings 1 and 3, with few ratings at the high end
rating <- rep(c(1, 2, 3, 4, 5), times = c(30, 10, 30, 15, 5))
freqDist <- data.frame(table(rating))
gt(freqDist)
| rating | Freq |
|---|---|
| 1 | 30 |
| 2 | 10 |
| 3 | 30 |
| 4 | 15 |
| 5 | 5 |
# Bar chart of the rating frequency table; heights come from the Freq column.
ggplot(freqDist, aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()
# Base
# summary() prints min, quartiles, median, mean and max for each column.
mtcars %>%
  select(mpg, cyl, hp) %>%
  summary()
## mpg cyl hp
## Min. :10.40 Min. :4.000 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :123.0
## Mean :20.09 Mean :6.188 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :335.0
# Upgraded
# tbl_summary() with custom statistics: continuous variables get three rows
# (mean/SD, median/quartiles, min/max) via the "continuous2" type, while
# categorical variables show count / total (percent).
mtcars %>%
  select(mpg, cyl, hp) %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ c(
        "{mean} ({sd})",
        "{median} ({p25}, {p75})",
        "{min}, {max}"
      ),
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    type = all_continuous() ~ "continuous2"
  )
| Characteristic | N = 32 |
|---|---|
| mpg | |
| Mean (SD) | 20.1 (6.0) |
| Median (IQR) | 19.2 (15.4, 22.8) |
| Range | 10.4, 33.9 |
| cyl | |
| 4 | 11 / 32 (34%) |
| 6 | 7 / 32 (22%) |
| 8 | 14 / 32 (44%) |
| hp | |
| Mean (SD) | 147 (69) |
| Median (IQR) | 123 (96, 180) |
| Range | 52, 335 |
Sometimes it is easier to view this data in a box plot. Potential outliers are the points that fall above the upper fence or below the lower fence. The fences are calculated from the quartiles and the interquartile range (IQR = Q3 − Q1) as follows:
\[\text{Upper Fence} = Q_3 + (1.5 \times \text{IQR})\] \[\text{Lower Fence} = Q_1 - (1.5 \times \text{IQR})\]
# Interactive box plot of character heights; `text` supplies the character
# names for the hover labels.
plot_ly(
  y = starwars$height,
  type = "box",
  name = "Height [cm]",
  text = starwars$name
) %>%
  layout(title = "Distribution of Star Wars Character Heights")
# Interactive scatter plot of mass against height, with character names as
# hover text. mode = "markers" is set explicitly so plotly does not have to
# guess the scatter mode (which it reports with a "No scatter mode specified"
# message).
plot_ly(
  starwars,
  y = ~mass, x = ~height,
  type = "scatter", mode = "markers",
  text = ~name
)
# Mass against height with a fitted straight line (method = "lm") and no
# confidence band. se uses FALSE rather than the reassignable shorthand F.
ggplot(starwars, aes(x = height, y = mass)) +
  geom_point(colour = "gray40") +
  geom_smooth(method = "lm", se = FALSE, colour = "darkblue") +
  theme_bw() +
  labs(
    title = "Relationship between Mass and Height of Star Wars Characters"
  )
## `geom_smooth()` using formula 'y ~ x'
# Correlation between height and mass, using only the rows where both values
# are present (use = "complete.obs").
with(starwars, cor(height, mass, use = "complete.obs"))
## [1] 0.1338842
# Remove the extreme-mass outlier (Jabba the Hutt, per the caption) by keeping
# only characters lighter than 400. Note that filter() also drops rows whose
# mass is NA, because the comparison evaluates to NA for them.
starwarsNoJabba <- starwars %>%
  filter(mass < 400)

# Same scatter plot and fitted line as before, now without the outlier.
# se uses FALSE rather than the reassignable shorthand F.
ggplot(starwarsNoJabba, aes(x = height, y = mass)) +
  geom_point(colour = "gray40") +
  geom_smooth(method = "lm", se = FALSE, colour = "darkblue") +
  theme_bw() +
  labs(
    title = "Relationship between Mass and Height of Star Wars Characters",
    caption = "*Excluding Jabba the Hutt"
  )
## `geom_smooth()` using formula 'y ~ x'
# Correlation between height and mass after removing the outlier, again using
# only rows where both values are present.
with(starwarsNoJabba, cor(height, mass, use = "complete.obs"))
## [1] 0.7612612
# Open the help page describing the sp500 dataset and its columns.
?sp500
# Time series of trading volume with a dashed linear trend line.
# The title and y-axis label describe trading volume, so the y aesthetic maps
# the `volume` column (the original mapped `high`, the daily high price, which
# contradicted the labels). se uses FALSE rather than the shorthand F.
ggplot(sp500, aes(x = date, y = volume)) +
  geom_line(colour = "gray40", alpha = 0.75) +
  geom_smooth(
    method = "lm", se = FALSE,
    colour = "darkblue", linetype = "dashed"
  ) +
  theme_bw() +
  labs(
    title = "Trading Volume of the S&P 500 Over Time",
    subtitle = "from 1950-2015",
    x = "Date",
    y = "Volume"
  )
## `geom_smooth()` using formula 'y ~ x'